Sanorama Hie et al., 2019
GitHub
Tutorial external API
External external API tutorial
A fix to run scran pooling normalization computeSumFactors in current python environment.
import scanpy as sc
import scanorama
import numpy as np
import pandas as pd
import os
# Working directory
os.chdir('/research/peer/fdeckert/FD20200109SPLENO')
# rpy2
os.environ['R_HOME'] = '/home/fdeckert/bin/miniconda3/envs/p.3.8.12-FD20200109SPLENO/lib/R'
# Communication
import rpy2.rinterface_lib.callbacks
import logging
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR) # Ignore R warnings
# Convertion
from rpy2.robjects import pandas2ri
import anndata2ri
# Activate conversion globaly
pandas2ri.activate()
anndata2ri.activate()
%load_ext rpy2.ipython
%%R
library(scran)
# Plotting
import rpy2.robjects as robjects
color_load = robjects.r.source('plotting_global.R')
color = dict()
for i in range(len(color_load[0])):
color[color_load[0].names[i]] = {key : color_load[0][i].rx2(key)[0] for key in color_load[0][i].names}
sc.set_figure_params(figsize=(5, 5))
adata = sc.read_h5ad('data/object/qc.h5ad')
adata = adata[adata.obs['doublet_class']=='Singlet'].copy()
def set_color(categories):
categories = [x for x in categories if x in list(adata.obs.columns)]
for category in categories:
adata.obs[category] = pd.Series(adata.obs[category], dtype='category')
keys = list(color[category].keys())
keys = [x for x in keys if x in list(adata.obs[category])]
adata.obs[category] = adata.obs[category].cat.reorder_categories(keys)
adata.uns[category+'_colors'] = np.array([color[category].get(key) for key in keys], dtype=object)
# Set colors
set_color(list(color.keys()))
adata_sub = dict()
for sample_group in adata.obs['sample_group'].unique():
adata_sub[sample_group] = adata[adata.obs['sample_group']==sample_group].copy()
adata_sub = list(adata_sub.values())
def scran_input(adata):
adata_pp = adata.copy()
# For computeSumFactors we need a good coverage for each gene
sc.pp.filter_genes(adata_pp, min_cells=20)
# Compute groups
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15, svd_solver='arpack')
sc.pp.neighbors(adata_pp)
sc.tl.louvain(adata_pp, key_added='groups', resolution=1)
input_groups = adata_pp.obs['groups']
# Set count matrix
data_mat = adata.X.T
data = {'input_groups': input_groups, 'data_mat': data_mat}
return(data)
scran_input = [scran_input(x) for x in adata_sub]
data_mat = scran_input[0]['data_mat']
input_groups = list(scran_input[0]['input_groups'])
%%R -i data_mat -i input_groups -o size_factors
size_factors = sizeFactors(computeSumFactors(SingleCellExperiment(list(counts=data_mat)), clusters=input_groups, min.mean=0.1))
adata_sub[0].obs['size_factors'] = size_factors
data_mat = scran_input[1]['data_mat']
input_groups = list(scran_input[1]['input_groups'])
%%R -i data_mat -i input_groups -o size_factors
size_factors = sizeFactors(computeSumFactors(SingleCellExperiment(list(counts=data_mat)), clusters=input_groups, min.mean=0.1))
adata_sub[1].obs['size_factors'] = size_factors
data_mat = scran_input[2]['data_mat']
input_groups = list(scran_input[2]['input_groups'])
%%R -i data_mat -i input_groups -o size_factors
size_factors = sizeFactors(computeSumFactors(SingleCellExperiment(list(counts=data_mat)), clusters=input_groups, min.mean=0.1))
adata_sub[2].obs['size_factors'] = size_factors
data_mat = scran_input[3]['data_mat']
input_groups = list(scran_input[3]['input_groups'])
%%R -i data_mat -i input_groups -o size_factors
size_factors = sizeFactors(computeSumFactors(SingleCellExperiment(list(counts=data_mat)), clusters=input_groups, min.mean=0.1))
adata_sub[3].obs['size_factors'] = size_factors
adata_sub[0].X /= adata_sub[0].obs['size_factors'].values[:,None]
sc.pp.log1p(adata_sub[0])
adata_sub[1].X /= adata_sub[1].obs['size_factors'].values[:,None]
sc.pp.log1p(adata_sub[1])
adata_sub[2].X /= adata_sub[2].obs['size_factors'].values[:,None]
sc.pp.log1p(adata_sub[2])
adata_sub[3].X /= adata_sub[3].obs['size_factors'].values[:,None]
sc.pp.log1p(adata_sub[3])
# Run Scanorama
scanorama.integrate_scanpy(adata_sub, dimred=100, knn=50)
# Concatenate scanorama output
X_scanorama = [ad.obsm['X_scanorama'] for ad in adata_sub]
X_scanorama = np.concatenate(X_scanorama)
# Test if X_scanorama is in same order as adata
obs_names = [ad.obs_names for ad in adata_sub]
obs_names = np.concatenate(obs_names)
all(obs_names==adata.obs_names)
# Add X_scanorama integration to adata
adata.obsm["X_scanorama"] = X_scanorama
Found 14772 genes among all datasets [[0. 0.94417643 0.68701931 0.54330401] [0. 0. 0.49862164 0.92246726] [0. 0. 0. 0.64848485] [0. 0. 0. 0. ]] Processing datasets (0, 1) Processing datasets (1, 3) Processing datasets (0, 2) Processing datasets (2, 3) Processing datasets (0, 3) Processing datasets (1, 2)
# Dimensional reduction and clustering
sc.pp.neighbors(adata, n_neighbors=150, use_rep='X_scanorama')
sc.tl.leiden(adata, resolution=1)
sc.tl.umap(adata)
# Plot
sc.pl.umap(adata, color=['leiden', 'tissue', 'treatment', 'label_fine_haemosphere', 'sample_rep', 'cc_phase_class', 'doublet_score_log2', 'pHb_RNA', 'pRb_RNA'], wspace=0.5, ncols=3)
# Dimensional reduction and clustering
sc.pp.neighbors(adata, n_neighbors=100, use_rep='X_scanorama')
sc.tl.leiden(adata, resolution=1)
sc.tl.umap(adata)
# Plot
sc.pl.umap(adata, color=['leiden', 'tissue', 'treatment', 'label_fine_haemosphere', 'sample_rep', 'cc_phase_class', 'doublet_score_log2', 'pHb_RNA', 'pRb_RNA'], wspace=0.5, ncols=3)
# Dimensional reduction and clustering
sc.pp.neighbors(adata, n_neighbors=50, use_rep='X_scanorama')
sc.tl.leiden(adata, resolution=1)
sc.tl.umap(adata)
# Plot
sc.pl.umap(adata, color=['leiden', 'tissue', 'treatment', 'label_fine_haemosphere', 'sample_rep', 'cc_phase_class', 'doublet_score_log2', 'pHb_RNA', 'pRb_RNA'], wspace=0.5, ncols=3)
# Dimensional reduction and clustering
sc.pp.neighbors(adata, n_neighbors=20, use_rep='X_scanorama')
sc.tl.leiden(adata, resolution=1)
sc.tl.umap(adata)
# Plot
sc.pl.umap(adata, color=['leiden', 'tissue', 'treatment', 'label_fine_haemosphere', 'sample_rep', 'cc_phase_class', 'doublet_score_log2', 'pHb_RNA', 'pRb_RNA'], wspace=0.5, ncols=3)
adata_set = adata_sub.copy()
adata_sub = adata_set.copy()
sc.pp.scale(adata_sub[0])
sc.pp.scale(adata_sub[1])
sc.pp.scale(adata_sub[2])
sc.pp.scale(adata_sub[3])
# Run Scanorama
scanorama.integrate_scanpy(adata_sub, dimred=100, knn=50)
# Concatenate scanorama output
X_scanorama = [ad.obsm['X_scanorama'] for ad in adata_sub]
X_scanorama = np.concatenate(X_scanorama)
# Test if X_scanorama is in same order as adata
obs_names = [ad.obs_names for ad in adata_sub]
obs_names = np.concatenate(obs_names)
all(obs_names==adata.obs_names)
# Add X_scanorama integration to adata
adata.obsm["X_scanorama"] = X_scanorama
Found 14772 genes among all datasets [[0. 0.83942109 0.57488479 0.61192571] [0. 0. 0.45416954 0.79186768] [0. 0. 0. 0.84068435] [0. 0. 0. 0. ]] Processing datasets (2, 3) Processing datasets (0, 1) Processing datasets (1, 3) Processing datasets (0, 3) Processing datasets (0, 2) Processing datasets (1, 2)
# Dimensional reduction and clustering
sc.pp.neighbors(adata, n_neighbors=150, use_rep='X_scanorama')
sc.tl.leiden(adata, resolution=1)
sc.tl.umap(adata)
# Plot
sc.pl.umap(adata, color=['leiden', 'tissue', 'treatment', 'label_fine_haemosphere', 'sample_rep', 'cc_phase_class', 'doublet_score_log2', 'pHb_RNA', 'pRb_RNA'], wspace=0.5, ncols=3)
# Dimensional reduction and clustering
sc.pp.neighbors(adata, n_neighbors=100, use_rep='X_scanorama')
sc.tl.leiden(adata, resolution=1)
sc.tl.umap(adata)
# Plot
sc.pl.umap(adata, color=['leiden', 'tissue', 'treatment', 'label_fine_haemosphere', 'sample_rep', 'cc_phase_class', 'doublet_score_log2', 'pHb_RNA', 'pRb_RNA'], wspace=0.5, ncols=3)
# Dimensional reduction and clustering
sc.pp.neighbors(adata, n_neighbors=50, use_rep='X_scanorama')
sc.tl.leiden(adata, resolution=1)
sc.tl.umap(adata)
# Plot
sc.pl.umap(adata, color=['leiden', 'tissue', 'treatment', 'label_fine_haemosphere', 'sample_rep', 'cc_phase_class', 'doublet_score_log2', 'pHb_RNA', 'pRb_RNA'], wspace=0.5, ncols=3)
# Dimensional reduction and clustering
sc.pp.neighbors(adata, n_neighbors=20, use_rep='X_scanorama')
sc.tl.leiden(adata, resolution=1)
sc.tl.umap(adata)
# Plot
sc.pl.umap(adata, color=['leiden', 'tissue', 'treatment', 'label_fine_haemosphere', 'sample_rep', 'cc_phase_class', 'doublet_score_log2', 'pHb_RNA', 'pRb_RNA'], wspace=0.5, ncols=3)
adata_sub = adata_set.copy()
sc.pp.regress_out(adata_sub[0], keys=['pHb_RNA'])
sc.pp.regress_out(adata_sub[1], keys=['pHb_RNA'])
sc.pp.regress_out(adata_sub[2], keys=['pHb_RNA'])
sc.pp.regress_out(adata_sub[3], keys=['pHb_RNA'])
sc.pp.scale(adata_sub[0])
sc.pp.scale(adata_sub[1])
sc.pp.scale(adata_sub[2])
sc.pp.scale(adata_sub[3])
# Run Scanorama
scanorama.integrate_scanpy(adata_sub, dimred=100, knn=50)
# Concatenate scanorama output
X_scanorama = [ad.obsm['X_scanorama'] for ad in adata_sub]
X_scanorama = np.concatenate(X_scanorama)
# Test if X_scanorama is in same order as adata
obs_names = [ad.obs_names for ad in adata_sub]
obs_names = np.concatenate(obs_names)
all(obs_names==adata.obs_names)
# Add X_scanorama integration to adata
adata.obsm["X_scanorama"] = X_scanorama
Found 14772 genes among all datasets [[0. 0.77532736 0.64631336 0.50244379] [0. 0. 0.55203308 0.93866299] [0. 0. 0. 0.89091194] [0. 0. 0. 0. ]] Processing datasets (1, 3) Processing datasets (2, 3) Processing datasets (0, 1) Processing datasets (0, 2) Processing datasets (1, 2) Processing datasets (0, 3)
# Dimensional reduction and clustering
sc.pp.neighbors(adata, n_neighbors=150, use_rep='X_scanorama')
sc.tl.leiden(adata, resolution=1)
sc.tl.umap(adata)
# Plot
sc.pl.umap(adata, color=['leiden', 'tissue', 'treatment', 'label_fine_haemosphere', 'sample_rep', 'cc_phase_class', 'doublet_score_log2', 'pHb_RNA', 'pRb_RNA'], wspace=0.5, ncols=3)
# Dimensional reduction and clustering
sc.pp.neighbors(adata, n_neighbors=100, use_rep='X_scanorama')
sc.tl.leiden(adata, resolution=1)
sc.tl.umap(adata)
# Plot
sc.pl.umap(adata, color=['leiden', 'tissue', 'treatment', 'label_fine_haemosphere', 'sample_rep', 'cc_phase_class', 'doublet_score_log2', 'pHb_RNA', 'pRb_RNA'], wspace=0.5, ncols=3)
# Dimensional reduction and clustering
sc.pp.neighbors(adata, n_neighbors=50, use_rep='X_scanorama')
sc.tl.leiden(adata, resolution=1)
sc.tl.umap(adata)
# Plot
sc.pl.umap(adata, color=['leiden', 'tissue', 'treatment', 'label_fine_haemosphere', 'sample_rep', 'cc_phase_class', 'doublet_score_log2', 'pHb_RNA', 'pRb_RNA'], wspace=0.5, ncols=3)
# Dimensional reduction and clustering
sc.pp.neighbors(adata, n_neighbors=20, use_rep='X_scanorama')
sc.tl.leiden(adata, resolution=1)
sc.tl.umap(adata)
# Plot
sc.pl.umap(adata, color=['leiden', 'tissue', 'treatment', 'label_fine_haemosphere', 'sample_rep', 'cc_phase_class', 'doublet_score_log2', 'pHb_RNA', 'pRb_RNA'], wspace=0.5, ncols=3)
# adata.write('data/object/int.h5ad')
# import sys
# sys.path.insert(0, '../scFacility/script')
# from dirFacility import adata2dir
# adata = adata.raw.to_adata()
# adata.layers['counts'] = adata.X
# adata2dir(adata, 'data/object/int/', assay="RNA", layers="counts", build_dir=True, overwrite=True)